In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from load_utils import *
In [2]:
# Load the talk-page diffs; later cells index the result by year key (d['2015']).
# NOTE(review): load_diffs / load_block_events_and_users presumably come from the
# `from load_utils import *` star import -- confirm and import them explicitly.
d = load_diffs()
# Block events and blocked-user table; neither is used by the cells visible below.
df_events, df_blocked_user_text = load_block_events_and_users()
In [3]:
# Per directed (author -> talk page) pair: mean predicted aggression score and
# number of comments, for 2015 comments passing the own_page / anon filters
# (presumably: registered users writing on other registered users' pages).
pair_stats = (
    d['2015']
    .query('not own_page and not author_anon and not recipient_anon')
    .groupby(['user_text', 'page_title'])['pred_aggression_score']
    # List aggregation + rename replaces the dict-renaming form
    # `.agg({'name': func})`, which pandas deprecated in 0.20 and rejects with
    # SpecificationError from 1.0 on. 'size' counts every row, like `len` did.
    .agg(['mean', 'size'])
    .rename(columns={'mean': 'aggresssivness', 'size': 'count'})
    .reset_index()
    # Only keep directions with more than 5 comments so the mean is meaningful.
    .query('count > 5')
    # `key` identifies this direction, `partner_key` the reverse direction.
    .assign(
        key=lambda x: 'From:' + x['user_text'] + ' to:' + x['page_title'],
        partner_key=lambda x: 'From:' + x['page_title'] + ' to:' + x['user_text'],
    )
)

# Self-join each direction with its reverse: *_x columns describe A -> B and
# *_y columns describe B -> A. Every pair therefore appears twice (once per
# direction), and pairs lacking a qualifying reverse direction are dropped.
pairs = pair_stats.merge(pair_stats, left_on='partner_key', right_on='key', how='inner')
In [4]:
# Joint distribution of the mean aggression in each direction of a user pair
# (x: A -> B, y: B -> A).
sns.jointplot(
    data=pairs,
    x='aggresssivness_x',
    y='aggresssivness_y',
)
Out[4]:
In [5]:
# Thresholds on the A -> B mean aggression: above the 95th percentile counts as
# "angry", below the 5th percentile as "friendly". Both are computed from the
# column they are applied to (aggresssivness_x); the original took t_friendly
# from aggresssivness_y, which only matched because `pairs` is a symmetric
# self-join.
t_friendly, t_angry = np.percentile(pairs['aggresssivness_x'], (5, 95))

# `@name` hands the exact float to query(); formatting with '%f' rounded the
# threshold to 6 decimals before comparing.
sns.distplot(
    pairs.query('aggresssivness_x > @t_angry')['aggresssivness_y'],
    hist=False,
    label='Angry A->B',
)
sns.distplot(
    pairs.query('aggresssivness_x < @t_friendly')['aggresssivness_y'],
    hist=False,
    label='Friendly A->B',
)
plt.xlabel('Aggresiveness B->A')
Out[5]:
Methodology 2: is the aggression score of what A says on B's page related to the score of the next thing B says on A's page?
In [7]:
cols = ['user_text', 'page_title', 'pred_aggression_score', 'rev_timestamp', 'rev_id']

# ab: comments by user A on B's talk page (author = user_text, page = page_title).
ab = d['2015'].query('not own_page and not author_anon and not recipient_anon')[cols]

# ba: the same comments with author and page swapped, so that joining on
# (user_text, page_title) lines up every A -> B comment (suffix _x) with every
# B -> A comment (suffix _y).
ba = ab.rename(columns={'user_text': 'page_title', 'page_title': 'user_text'})[cols]

micro_pairs = (
    ab.merge(ba, on=['user_text', 'page_title'], how='inner')
    # Time from A's comment to B's comment. The original computed
    # rev_timestamp_x - rev_timestamp_y, which kept B's comments made *before*
    # A's -- the opposite of "the next thing B says".
    .assign(delta=lambda x: x['rev_timestamp_y'] - x['rev_timestamp_x'])
    .assign(
        delta_positive=lambda x: x.delta > pd.Timedelta('0 seconds'),
        delta_less_30=lambda x: x.delta < pd.Timedelta('30 days'),
    )
    # Keep replies that come after A's comment and within 30 days of it.
    .query('delta_positive and delta_less_30')
    # Smallest gap first, so .first() below picks B's *next* comment.
    # DataFrame.sort was removed in pandas 0.20; sort_values is the replacement
    # and is what the rest of this notebook already uses.
    .sort_values('delta', ascending=True)
    # One row per A -> B comment.
    .groupby('rev_id_x', as_index=False).first()
)
In [8]:
# Joint distribution of the score of A's comment (x) and of the matched
# comment by B (y).
sns.jointplot(
    data=micro_pairs,
    x='pred_aggression_score_x',
    y='pred_aggression_score_y',
)
Out[8]:
In [9]:
# 5th / 50th / 95th percentiles of the score of A's comment. t_neutral is not
# used in this cell but is kept in case later cells rely on it.
t_friendly, t_neutral, t_angry = np.percentile(micro_pairs['pred_aggression_score_x'], (5, 50, 95))

# Distribution of B's score, split by whether A's comment was in the top 5%
# ("angry") or bottom 5% ("friendly"). `@name` passes the exact threshold;
# formatting it with '%f' rounded it to 6 decimals before the comparison.
sns.distplot(
    micro_pairs.query('pred_aggression_score_x > @t_angry')['pred_aggression_score_y'],
    hist=False,
    label='Angry A->B',
)
sns.distplot(
    micro_pairs.query('pred_aggression_score_x < @t_friendly')['pred_aggression_score_y'],
    hist=False,
    label='Friendly A->B',
)
plt.xlabel('Aggression B->A')
Out[9]:
In [16]:
# 2015 comments passing the own_page / anon filters (shared by both aggregations).
talk_2015 = d['2015'].query('not own_page and not author_anon and not recipient_anon')

# Mean score of the comments each user *writes*, for users with > 5 comments.
# List aggregation + rename replaces the dict-renaming `.agg({'name': func})`,
# which pandas deprecated in 0.20 and rejects with SpecificationError from 1.0.
# 'size' counts every row, like `len` did.
out_score = (
    talk_2015
    .groupby('user_text')['pred_aggression_score']
    .agg(['mean', 'size'])
    .rename(columns={'mean': 'out_score', 'size': 'count'})
    .reset_index()
    .query('count > 5')
)

# Mean score of the comments each user *receives* on their page, for pages with
# > 5 comments; page_title is renamed so it can be joined against user_text.
in_score = (
    talk_2015
    .groupby('page_title')['pred_aggression_score']
    .agg(['mean', 'size'])
    .rename(columns={'mean': 'in_score', 'size': 'count'})
    .reset_index()
    .query('count > 5')
    .rename(columns={'page_title': 'user_text'})
)

# Users with both an outgoing and an incoming score. `saintliness` is
# out_score - in_score: negative when the user's own comments score lower than
# the comments they receive. The shared `count` column becomes count_x / count_y.
in_out = (
    out_score
    .merge(in_score, how='inner', on='user_text')
    .assign(saintliness=lambda x: x['out_score'] - x['in_score'])
)
In [17]:
# Joint distribution of received (in_score) vs. sent (out_score) mean score per user.
sns.jointplot(
    data=in_out,
    x='in_score',
    y='out_score',
)
Out[17]:
In [18]:
# Normalised histogram (no KDE curve) of out_score - in_score per user, NaNs dropped.
sns.distplot(
    in_out['saintliness'].dropna(),
    norm_hist=True,
    kde=False,
)
Out[18]:
In [19]:
# Saints: the five users with the lowest saintliness (out_score - in_score).
(
    in_out
    .sort_values(by='saintliness')
    .head(5)
)
Out[19]:
In [20]:
# Saints, restricted to users with in_score > 0 and out_score < 0:
# the five lowest saintliness values among them.
(
    in_out
    .sort_values(by='saintliness')
    .query('in_score > 0 and out_score < 0')
    .head(5)
)
Out[20]:
In [26]:
#d['2015'].query("user_text == 'Parenchyma18'")
In [22]:
# Saints with in_score > 0 and out_score < 0 (five lowest saintliness values).
# NOTE(review): this cell repeats the filtered "Saints" cell shown earlier.
(
    in_out
    .sort_values(by='saintliness')
    .query('in_score > 0 and out_score < 0')
    .head(5)
)
Out[22]:
In [23]:
# Provocateurs: the five users with the highest saintliness (out_score - in_score).
(
    in_out
    .sort_values(by='saintliness', ascending=False)
    .head(5)
)
Out[23]:
In [24]:
# Provocateurs, restricted to users with out_score > 0 and in_score < 0:
# the five highest saintliness values among them.
(
    in_out
    .sort_values(by='saintliness', ascending=False)
    .query('out_score > 0 and in_score < 0')
    .head(5)
)
Out[24]: